Marginal distributions of Census characteristics

Author
Affiliation

Honorary Professor, University of Sydney

Published

1:19PM 30 May 2023

Code
import { aq, op } from "@uwdata/arquero"

We build data sets that contain the marginal distribution of the CURF variables, at various levels of spatial resolution. We begin with CEDs. We use ABS TableBuilder to first subset to adult Australian citizens to provide the marginal distributions of these variables.

We do this for CED, SED and SA1 levels.

1 Constants, utility functions

2 Map CEDs to CURF SA4 agglomerations

From TableBuilder we have counts of adult citizens in CED/SA4 combinations. Many of these will be zero.

Code
ced_sa4 <- ced_read_function("SA4")
sed_sa4 <- sed_read_function("SA4")

sa4 <- bind_rows(ced_sa4,sed_sa4) %>% 
  pivot_longer(cols = !Division & !state,
               names_to = "SA4_name",
               values_to = "n") %>%
  filter(!grepl(pattern = "^\\.{3}", SA4_name)) %>%
  filter(SA4_name != "Total") %>%
  filter(n > 0)

load(here("common_data/Census/2016/curf/sa4_codes.RData"))

sa4 <- sa4 %>%
  left_join(sa4_codes %>%
              mutate(
                SA4_CODE21 = str_extract(pattern = "^[0-9]{3}", SA4_class_code),
                SA4_name = str_remove(pattern = "^[0-9]{3} ", SA4_class_code)
              ),
            by = "SA4_name")

areaenum <- sa4 %>% 
  group_by(state,Division,AREAENUM) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)*100) %>% 
  ungroup() %>% 
  left_join(sa4_codes %>% distinct(AREAENUM,description),
            by = "AREAENUM")

3 Religion

Code
## RELP_collapsed
relp <- bind_rows(
  ced_read_function("RELP_COLLAPSED"),
  sed_read_function("RELP")
  ) %>% 
  pivot_longer(cols = !Division & !state,
               names_to = "RELP",
               values_to = "n") %>% 
  filter(!grepl(pattern = "^\\.{3}",RELP)) %>% 
  filter(RELP != "Total") %>% 
  mutate(
    RELP_collapsed = case_when(
      RELP %in% c("Anglican",
                  "Baptist",
                  "Brethren",
                  "Churches of Christ",
                  "Jehovah's Witnesses",
                  "Latter-day Saints",
                  "Lutheran",
                  "Other Protestant",
                  "Pentecostal",
                  "Uniting Church",
                  "Salvation Army",
                  "Seventh-day Adventist") ~ "Protestant",
      
      RELP == "Eastern Orthodox" ~ "Greek Orthodox",
      
      RELP == "Presbyterian and Reformed" ~ "Presbyterian",
      
      RELP %in% c("No Religion (so described)",
                  "No Religion, (so described)",
                  "Other Spiritual Beliefs",
                  "Secular Beliefs",
                  "Secular Beliefs and Other Spiritual Beliefs and No Religious Affiliation, nfd",
                  "Secular Beliefs and Other Spiritual Beliefs") ~ "No Religion (so described)",
      
      RELP %in% c("Christianity, nfd", "Other Christian") ~ "Other Christian",
      
      RELP %in% c("Buddhism","Catholic","Hinduism","Islam") ~ RELP,
      
      RELP %in% c("Not stated","Inadequately described") ~ NA,
      
      TRUE ~ "Judaism and Other Religions" 
    ),
    
    RELP_collapsed = factor(RELP_collapsed)
  )
Code
adjustment_missing <- curf %>%
  count(RELP_collapsed) %>%
  mutate(RELP_collapsed = if_else(is.na(RELP_collapsed), "Missing", RELP_collapsed)) %>%
  filter(RELP_collapsed != "Overseas visitor") %>%
  mutate(p = n / sum(n) * 100) %>%
  arrange(desc(p)) %>%
  left_join(test_data %>%
              count(RELP_collapsed) %>%
              mutate(p = n / sum(n) * 100) %>%
              arrange(desc(p)),
            by = "RELP_collapsed") %>% 
  mutate(w = (p.y - p.x)/p.x[RELP_collapsed == "Missing"],
         w = if_else(w < 0, 0, w),
         w = w/sum(w,na.rm = TRUE))

adjust <- function(data){
  z <- left_join(data,
                 adjustment_missing %>%
                   select(RELP_collapsed,w),
                 by = "RELP_collapsed") %>%
    mutate(n_adj = n + w*n[is.na(RELP_collapsed)])
  
  return(z)
}

relp <- relp %>% 
  group_by(state,Division,RELP_collapsed) %>% 
  summarise(n = sum(n, na.rm = TRUE)) %>% 
  ungroup() %>%
  group_nest(state,Division) %>% 
  mutate(d = map(.x = data,
                 .f = ~adjust(.x))
         ) %>% 
  ungroup() %>% 
  unnest(d) %>% 
  select(-data,-w) %>%
  filter(!is.na(RELP_collapsed)) %>% 
  rename(n_raw = n,
         n = n_adj)
Code
relp_wide <- relp %>%
  select(state,Division, RELP_collapsed, n) %>%
  group_by(state,Division) %>%
  mutate(p = n / sum(n) * 100) %>%
  ungroup() %>%
  select(state,Division, RELP_collapsed, p) %>%
  pivot_wider(id_cols = c("state","Division"),
              names_from = "RELP_collapsed",
              values_from = "p") %>%
  rename(Other = 'Judaism and Other Religions',
         Orthodox = "Greek Orthodox",
         None = "No Religion (so described)")
ojs_define(relp_wide_raw = relp_wide)
Code
relp_tab = transpose(relp_wide_raw)
Inputs.table(relp_tab,
  {
    columns: [ "state", "Division", "None", "Catholic", "Orthodox", 
              "Protestant", "Presbyterian", "Other Christian",
              "Other", "Buddhism", "Hinduism", "Islam", "Other" ],
    format: {
      Buddhism: x => d3.format(".1f")(x),
      Catholic: x => d3.format(".1f")(x),
      Orthodox: x => d3.format(".1f")(x),
      Hinduism: x => d3.format(".1f")(x),
      Islam: x => d3.format(".1f")(x),
      Other: x => d3.format(".1f")(x),
      None: x => d3.format(".1f")(x),
      "Other Christian": x => d3.format(".1f")(x),
      "Presbyterian": x => d3.format(".1f")(x),
      Protestant: x => d3.format(".1f")(x)
    }
  }
  )

Closeness (total variation norm) to population distribution:

Code
ojs_define(
  relp_tv = left_join(
    relp %>%
      select(state,Division, RELP_collapsed, n) %>%
      group_by(state,Division) %>%
      mutate(p = n / sum(n) * 100) %>%
      ungroup() %>%
      select(state,Division, RELP_collapsed, p),
    test_data %>%
      count(RELP_collapsed) %>%
      mutate(p = n / sum(n) * 100),
    by = "RELP_collapsed") %>% 
    group_by(state,Division) %>%
    summarise(tv = sum(abs(p.x - p.y))) %>%
    ungroup() %>%
    arrange(tv)
  )
Code
Inputs.table(transpose(relp_tv))

3.1 SA1

Code
sa1_relp <- sa1_read_function(v="RELP")

4 Sex

Code
sexp <- bind_rows(
  ced_read_function("SEXP"),
  sed_read_function("SEXP")
  ) %>% 
  pivot_longer(cols = !Division & !state,
               names_to = "SEXP",
               values_to = "n") %>% 
  filter(!grepl(pattern = "^\\.{3}",SEXP)) %>% 
  filter(SEXP != "Total") %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)*100) %>% 
  ungroup() 
Code
sexp %>%   
  filter(SEXP == "Female") %>% 
  ggplot(.,
         aes(x = p)) +
  geom_histogram() + 
  theme_minimal(base_family = "Avenir Next")

Code
sexp_tab <- sexp %>% 
  pivot_wider(id_cols = c("state","Division"),
              names_from = "SEXP",
              values_from = c("n","p")
              )
ojs_define(sexp_tab_raw = sexp_tab)              
Code
sexp_tab = transpose(sexp_tab_raw)

Inputs.table(sexp_tab)

4.1 SA1

Code
sa1_sexp <- sa1_read_function("SEXP") %>% 
  group_by(SA1) %>% 
  mutate(p = n/sum(n) * 100) %>% 
  ungroup()

5 Education

Two files are read here. First, HEAP or level of Highest Educational Attainment.

Code
heap <- bind_rows(
  ced_read_function("HEAP"),
  sed_read_function("HEAP")
  ) %>% 
  pivot_longer(cols = !Division & !state,
               names_to = "j",
               values_to = "n") %>% 
  filter(j != "Total") %>% 
  group_by(state,Division) %>% mutate(p = n/sum(n)) %>% ungroup()

Second, we also extract counts of those having completed high school from Census variable HSCP:

Code
hs <- bind_rows(
  ced_read_function("HSCP"),
  sed_read_function("HSCP")
  ) %>% 
  pivot_longer(cols = !Division & !state,
               names_to = "j",
               values_to = "n") %>% 
  filter(j != "Total") %>% 
  group_by(state,Division) %>%
  mutate(p = n/sum(n)) %>%
  ungroup()

Third, we extract counts of adult citizens with university levels qualifications:

Code
uni <- heap %>% 
  mutate(
    uni = if_else(j %in% c("Bachelor Degree Level",
                                "Graduate Diploma and Graduate Certificate Level",
                                "Postgraduate Degree Level"),
                       "Yes",
                       "No")
  ) %>% 
  group_by(state,Division,uni) %>% 
  summarise(n = sum(n)) %>% 
  ungroup()

5.1 SA1

Code
sa1_heap <- sa1_read_function("HEAP")
sa1_hscp <- sa1_read_function("HSCP")

6 Indigenous indicator (INCG_binary)

Code
ingp <- bind_rows(
  ced_read_function("INGP"),
  sed_read_function("INGP")
  ) %>% 
  pivot_longer(cols = -c(Division,state),
               names_to = "INGP",
               values_to = "n") %>% 
  filter(INGP != "Total") %>% 
  mutate(INGP_binary = INGP %in% 
           c("Aboriginal",
             "Torres Strait Islander",
             "Both Aboriginal and Torres Strait Islander")
         ) %>% 
  group_by(state,Division,INGP_binary) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)*100) %>% 
  ungroup() 
Code
ojs_define(ingp_raw = ingp %>%
             pivot_wider(id_cols = Division,
                         names_from = "INGP_binary",
                         values_from = c("p")
                         )
           )
Code
Inputs.table(transpose(ced_ingp_raw))

7 Income (INCP_continuous)

Code
incp <- bind_rows(
  ced_read_function("INCP"),
  sed_read_function("INCP")
  ) %>%  
  pivot_longer(cols = !Division & !state,
               names_to = "INCP",
               values_to = "n") %>% 
  filter(INCP != "Total") %>% 
  mutate(
    INCP = str_remove(pattern = "^\\$.*\\(", INCP),
    INCP = str_remove(pattern = "\\)$", INCP)
  )
    
incp_continuous <- incp %>%
  mutate(
    INCP_continuous = case_when(
      INCP %in% c("Negative income",
                  "Nil income",
                  "$1-$7,799",
                  "$7,800-$15,599") ~ 1,
      ## "Less than $15,000 per year"
      INCP %in% c("$15,600-$20,799",
                  "$20,800-$25,999") ~ 2,
      ## "$15,001 to $25,000 per year"
      INCP %in% c("$26,000-$33,799") ~ 3,
      ## "$25,001 to $35,000 per year"
      INCP %in% c("$33,800-$41,599") ~ 4,
      ## "$35,001 to $45,000 per year"
      INCP %in% c("$41,600-$51,999",
                  "$52,000-$64,999") ~ 5,
      ## "$45,001 to $60,000 per year"
      INCP %in% "$65,000-$77,999" ~ 6,
      ## "$60,001 to $80,000 per year"
      INCP %in% c("$78,000-$90,999",
                  "$91,000-$103,999") ~ 7,
      ## "$80,001 to $100,000 per year"
      INCP == "$104,000-$155,999" ~ 8,
      INCP %in% c("$156,000-$181,999", "$182,000 or more") ~ 9,
      TRUE ~ 999
    )
  ) %>%
  group_by(state, Division, INCP_continuous) %>%
  summarise(n = sum(n)) %>%
  ungroup() %>%
  group_by(state, Division) %>%
  mutate(p = n / sum(n) * 100) %>%
  ungroup() 


adjustment_missing <- curf %>%
  filter(INCP_original != "Overseas visitor") %>%
  filter(!(AGEP %in% as.character(0:17))) %>%
  mutate(
    INCP_continuous = case_when(
      INCP_original %in% c("Negative income",
                  "Nil income",
                  "$1-$7,799",
                  "$7,800-$15,599") ~ 1,   ## "Less than $15,000 per year"
      INCP_original %in% c("$15,600-$20,799",
                  "$20,800-$25,999") ~ 2, ## "$15,001 to $25,000 per year"
      INCP_original %in% c("$26,000-$33,799") ~ 3, ## "$25,001 to $35,000 per year"
      INCP_original %in% c("$33,800-$41,599") ~ 4, ## "$35,001 to $45,000 per year"
      INCP_original %in% c("$41,600-$51,999",
                  "$52,000-$64,999") ~ 5, ## "$45,001 to $60,000 per year"
      INCP_original %in% "$65,000-$77,999" ~ 6,    ## "$60,001 to $80,000 per year"
      INCP_original %in% c("$78,000-$90,999",
                  "$91,000-$103,999") ~ 7, ## "$80,001 to $100,000 per year"
      INCP_original == "$104,000-$155,999" ~ 8,
      INCP_original %in% c("$156,000>") ~ 9,
      TRUE ~ 999)
  ) %>% 
  count(INCP_continuous) %>%
  mutate(p = n / sum(n) * 100) %>%
  arrange(desc(p)) %>%
  left_join(test_data %>%
              count(INCP_continuous) %>%
              mutate(p = n / sum(n) * 100) %>%
              arrange(desc(p)),
            by = "INCP_continuous") %>% 
  mutate(w = (p.y - p.x)/p.x[INCP_continuous == 999],
         w = if_else(w < 0, 0, w),
         w = w/sum(w,na.rm = TRUE))

adjust <- function(data){
  z <- left_join(data,
                 adjustment_missing %>%
                   select(INCP_continuous,w),
                 by = "INCP_continuous") %>%
    mutate(n_adj = n + w*n[INCP_continuous == 999])
  
  return(z)
}

incp_continuous_adjusted <- incp_continuous %>%
  group_nest(state,Division) %>% 
  mutate(d = map(.x = data,
                 .f = ~adjust(.x))
         ) %>% 
  ungroup() %>% 
  unnest(d) %>% 
  select(-data,w) %>%
  filter(!is.na(INCP_continuous)) %>% 
  rename(n_raw = n,
         n = n_adj) %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n,na.rm = TRUE)*100) %>% 
  ungroup()
Code
ojs_define(incp_raw = incp_continuous %>%
              pivot_wider(id_cols = c("state","Division"),
                          names_from = "INCP_continuous",
                          values_from = "p")
           )
Code
Inputs.table(transpose(incp_raw),
  {
    columns: [ "state", "Division", "1", "2", "3", "4", "5", "6", "7", "8", "9" ],
    format: {
      "1": x => d3.format(".1f")(x),
      "2": x => d3.format(".1f")(x),
      "3": x => d3.format(".1f")(x),
      "4": x => d3.format(".1f")(x),
      "5": x => d3.format(".1f")(x),
      "6": x => d3.format(".1f")(x),
      "7": x => d3.format(".1f")(x),
      "8": x => d3.format(".1f")(x),
      "9": x => d3.format(".1f")(x)
    }
  }
)

Closeness (total variation norm) to population distribution:

Code
ojs_define(incp_tv = left_join(incp_continuous %>%
                            filter(INCP_continuous != 999),
                          test_data %>%
                            count(INCP_continuous) %>%
                            mutate(p = n/sum(n)*100),
                          by = "INCP_continuous") %>%
    group_by(state,Division) %>%
    summarise(tv = sum(abs(p.x - p.y))) %>%
    ungroup() %>%
    arrange(tv)
  )
Code
Inputs.table(transpose(incp_tv))

7.1 SA1

Code
sa1_incp <- sa1_read_function("INCP") %>% 
   mutate(
    j = str_remove(pattern = "^\\$.*\\(", j),
    j = str_remove(pattern = "\\)$", j)
  )


sa1_incp_continuous <- sa1_incp %>%
  mutate(
    k = case_when(
      j %in% c("Negative income",
                  "Nil income",
                  "$1-$7,799",
                  "$7,800-$15,599") ~ 1,
      ## "Less than $15,000 per year"
      j %in% c("$15,600-$20,799",
                  "$20,800-$25,999") ~ 2,
      ## "$15,001 to $25,000 per year"
      j %in% c("$26,000-$33,799") ~ 3,
      ## "$25,001 to $35,000 per year"
      j %in% c("$33,800-$41,599") ~ 4,
      ## "$35,001 to $45,000 per year"
      j %in% c("$41,600-$51,999",
                  "$52,000-$64,999") ~ 5,
      ## "$45,001 to $60,000 per year"
      j %in% "$65,000-$77,999" ~ 6,
      ## "$60,001 to $80,000 per year"
      j %in% c("$78,000-$90,999",
                  "$91,000-$103,999") ~ 7,
      ## "$80,001 to $100,000 per year"
      j == "$104,000-$155,999" ~ 8,
      j %in% c("$156,000-$181,999", "$182,000 or more") ~ 9,
      TRUE ~ 999
    )
  ) %>%
  group_by(SA1,k) %>%
  summarise(n = sum(n)) %>%
  ungroup() %>%
  rename(j = k) %>% 
  group_by(SA1) %>%
  mutate(p = n / sum(n) * 100) %>%
  ungroup() 

sa1_incp_continuous_adjusted <- sa1_incp_continuous %>%
  rename(INCP_continuous = j) %>% 
  group_nest(SA1) %>% 
  mutate(d = map(.x = data,
                 .f = ~adjust(.x))
         ) %>% 
  ungroup() %>% 
  unnest(d) %>% 
  select(-data,w) %>%
  filter(!is.na(INCP_continuous)) %>% 
  rename(n_raw = n,
         n = n_adj) %>% 
  group_by(SA1) %>% 
  mutate(p = n/sum(n,na.rm = TRUE)*100) %>% 
  ungroup()

8 Household income

From the persons, place of enumeration database in TableBuilder, we extract HIND (total household income) for CEDs and SEDs.

Code
hind <- bind_rows(
  ced_read_function("HIND"),
  sed_read_function("HIND")
) %>%  
  pivot_longer(cols = !Division & !state,
               names_to = "HIND",
               values_to = "n") %>% 
  filter(HIND != "Total") %>% 
  mutate(
    HIND = str_remove(pattern = "^\\$.*\\(", HIND),
    HIND = str_remove(pattern = "\\)$", HIND)
  )

levs <- hind %>% distinct(HIND) %>% pull(HIND)

hind <- hind %>% mutate(HIND = factor(HIND,levels=levs,ordered = TRUE))

8.1 SA1

Code
sa1_hind <- sa1_read_function("HIND") %>% 
   mutate(
    j = str_remove(pattern = "^\\$.*\\(", j),
    j = str_remove(pattern = "\\)$", j)
  )

9 Occupation

Code
occp <- bind_rows(
  ced_read_function("OCCP"),
  sed_read_function("OCCP")
) %>%
  pivot_longer(cols = !Division & !state,
               names_to = "OCCP",
               values_to = "n") %>% 
  filter(OCCP != "Total") %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()

10 Type of Government assistance

Code
igap <- bind_rows(
  ced_read_function("IGAP"),
  sed_read_function("IGAP")
) %>% 
    pivot_longer(cols = !Division & !state,
               names_to = "IGAP",
               values_to = "n") %>% 
  filter(IGAP != "Total") %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()

11 Born country

Australia, UK/NZ, all others.

Code
born_country_three <-
  bind_rows(
    ced_read_function("born_country_three"),
    sed_read_function("born_country_three"),
  ) %>% 
  rename(Australia = `Australia (includes External Territories)`) %>% 
  mutate(`UK/NZ` = `New Zealand` + `United Kingdom, Channel Islands and Isle of Man`) %>% 
  select(state,Division,Australia,`UK/NZ`) %>% 
  left_join(
    sexp %>%
      group_by(state,Division) %>%
      summarise(Total = sum(n)) %>%
      ungroup(),
    by = c("state","Division")
    ) %>% 
  mutate(Other = Total - Australia - `UK/NZ`) %>% 
  select(-Total) %>%
  pivot_longer(cols = !Division & !state,
               names_to = "born_country_three",
               values_to = "n") %>% 
  group_by(state,Division) %>% 
  mutate(p=n/sum(n)*100) %>% 
  ungroup()
Code
ojs_define(born_country_three_raw = born_country_three %>% 
             pivot_wider(id_cols = Division,
                         names_from = born_country_three,
                         values_from = "p")
           )
Code
Inputs.table(transpose(born_country_three))

Closeness (total variation norm) to population distribution:

Code
ojs_define(
  bcp_tv = left_join(born_country_three,
                          test_data %>%
                            count(born_country_three) %>%
                            mutate(p = n/sum(n)*100),
                          by = "born_country_three") %>%
    group_by(Division) %>%
    summarise(tv = sum(abs(p.x - p.y))) %>%
    ungroup() %>%
    arrange(tv)
  )
Code
Inputs.table(transpose(bcp_tv))

12 AGE

Two TableBuilder files are used here, one with the 5 year buckets, the other to get the counts in discrete ages 18 through 24.

Code
age_five <- bind_rows(
  ced_read_function("AGE5P"),
  sed_read_function("AGE5P")
) %>%
  pivot_longer(cols = !Division & !state,
               names_to = "AGE5P",
               values_to = "n") %>% 
  filter(n > 0) %>% 
  filter(AGE5P != "Total") %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)*100) %>% 
  ungroup()

age_young <- bind_rows(
  ced_read_function("AGE_18_24",10),
  sed_read_function("AGE_18_24",10)
) %>%
  pivot_longer(cols = !Division & !state,
               names_to = "AGE_18_24",
               values_to = "n") %>% 
  filter(n > 0) %>% 
  filter(AGE_18_24 != "Total") %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)*100) %>% 
  ungroup()
  
agep <- bind_rows(
  age_young %>%
    rename(age = AGE_18_24),
  age_five %>% 
    rename(age = AGE5P) %>% 
    filter(age != "15-19 years") %>% 
    filter(age != "20-24 years") 
  ) %>% 
  mutate(
    AGEP = str_remove(pattern = " years.*$",age),
    AGEP = if_else(AGEP %in% c("85-89","90-94","95-99","100"),
                   "85>",
                   AGEP),
      AGEP = factor(
      AGEP,
      levels = c(
        as.character(
          0:24),
          "25-29",
          "30-34",
          "35-39",
          "40-44",
          "45-49",
          "50-54",
          "55-59",
          "60-64",
          "65-69",
          "70-74",
          "75-79",
          "80-84",
          "85>"),
      ordered = TRUE)
  ) %>% 
  group_by(state,Division,AGEP) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  select(state,Division,AGEP,n)

agep_continuous <- agep %>%
  mutate(
    AGE_continuous = case_when(
      AGEP %in% as.character(18:24) ~ as.numeric(AGEP),
      AGEP == "25-29" ~ 27,
      AGEP == "30-34" ~ 32,
      AGEP == "35-39" ~ 37,
      AGEP == "40-44" ~ 42,
      AGEP == "45-49" ~ 47,
      AGEP == "50-54" ~ 52,
      AGEP == "55-59" ~ 57,
      AGEP == "60-64" ~ 62,
      AGEP == "65-69" ~ 67,
      AGEP == "70-74" ~ 72,
      AGEP == "75-79" ~ 77,
      AGEP == "80-84" ~ 82,
      TRUE ~ 87
    )
  ) %>% 
  group_by(state,Division,AGE_continuous) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  select(state,Division,AGE_continuous,n)

12.1 SA1

Code
sa1_age5p <- sa1_read_function("AGE5P")

13 Ancestry original

Read hierarchy files from ABS:

Code
ancestry_hierarchy <- readxl::read_xls(
  path = here(
    "common_data/Census/2021/meta/ancestry_hierarchy.xls"
    ),
  sheet = 4,
  skip = 7,
  trim_ws = TRUE,
  col_names = c("row_name","a1","a2","a3")
  ) %>% 
  mutate(a1 = if_else(!grepl("[A_z]",a1),NA,a1)) %>% 
  tidyr::fill(a1) %>% 
  group_by(a1) %>% 
  tidyr::fill(a2, .direction = "up") %>% 
  mutate(a2 = if_else(grepl("^[0-9]",a2),NA,a2)) %>% 
  tidyr::fill(a2) %>% 
  tidyr::fill(a3, .direction = "up") %>%
  ungroup() %>% 
  select(-row_name) %>% 
  mutate(a1 = str_to_title(a1)) %>% 
  mutate(a1 = gsub(pattern = "And",
                   replacement = "and",
                   a1),
         a1 = gsub(pattern = "Of",
                   replacement = "of",
                   a1),
         a1 = gsub(pattern = "The",
                   replacement = "the",
                   a1),
         a3 = gsub(pattern = "\\, nec$",
                   replacement = "",
                   a3),
         a3 = gsub(pattern = "\\, nfd$",
                   replacement = "",
                   a3) 
         ) %>% 
  filter(a1 != "Total") %>% 
  filter(a2 != "Total") %>% 
  filter(a3 != "Total") %>% 
  distinct()
Code
ancp_raw <- bind_rows(
  ced_read_function("ANC1P"),
  sed_read_function("ANC1P")
) %>% 
  mutate(`Oceanian, nfd` = as.double(`Oceanian, nfd`)) %>% 
  pivot_longer(cols = where(is.double),
               names_to = "ancp",
               values_to = "n",
               values_drop_na = TRUE)

ancp_full <- ancp_raw %>% 
  mutate(ancp = gsub(pattern = "\\, nfd$",
                     replacement = "",
                     ancp)) %>% 
    mutate(ancp = gsub(pattern = "\\, nec$",
                     replacement = "",
                     ancp)) %>% 
    mutate(ancp = gsub(pattern = "\\, so described$",
                     replacement = "",
                     ancp)) %>% 
  filter(ancp != "Total") %>% 
  group_by(state,Division,ancp) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  left_join(ancestry_hierarchy,
            by = c("ancp" = "a3")
            ) %>% 
  mutate(
    a1 = if_else(ancp == "Creole",
                 "Peoples of the Americas",
                 a1),
    a1 = if_else(ancp == "African",
                 "Sub-Saharan African",
                 a1),
    a1 = if_else(ancp == "Asian",
                 "North-East Asian",
                 a1),
    a1 = if_else(ancp == "Caucasian" | ancp == "European" | ancp == "Eurasian",
                 "North-West European",
                 a1),
    a2 = if_else(
      is.na(a2),
      ancestry_hierarchy$a2[match(ancp,ancestry_hierarchy$a2)],
      a2),
    a1 = if_else(
      is.na(a1),
      ancestry_hierarchy$a1[match(ancp,ancestry_hierarchy$a2)],
      a1),
    ## try matching at level 1
    a2 = if_else(
      is.na(a2),
      ancestry_hierarchy$a2[match(ancp,ancestry_hierarchy$a1)],
      a2),
    a1 = if_else(
      is.na(a1),
      ancestry_hierarchy$a1[match(ancp,ancestry_hierarchy$a1)],
      a1),
    a1 = if_else(ancp %in% c("Inadequately described",
                             "Not stated"),
                 ancp,
                 a1),
    a2 = if_else(ancp %in% c("Inadequately described",
                             "Not stated"),
                 ancp,
                 a2)
  )

ancp_level1 <- ancp_full %>%
  group_by(state,Division,a1) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()

13.1 SA1

Code
sa1_ancp_raw <- sa1_read_function("ANC1P")

sa1_ancp_full <- sa1_ancp_raw %>% 
  rename(ancp = j) %>% 
  mutate(ancp = gsub(pattern = "\\, nfd$",
                     replacement = "",
                     ancp)) %>% 
    mutate(ancp = gsub(pattern = "\\, nec$",
                     replacement = "",
                     ancp)) %>% 
    mutate(ancp = gsub(pattern = "\\, so described$",
                     replacement = "",
                     ancp)) %>% 
  filter(ancp != "Total") %>% 
  group_by(SA1,ancp) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  left_join(ancestry_hierarchy,
            by = c("ancp" = "a3")
            ) %>% 
  mutate(
    a1 = if_else(ancp == "Creole",
                 "Peoples of the Americas",
                 a1),
    a1 = if_else(ancp == "African",
                 "Sub-Saharan African",
                 a1),
    a1 = if_else(ancp == "Asian",
                 "North-East Asian",
                 a1),
    a1 = if_else(ancp == "Caucasian" | ancp == "European" | ancp == "Eurasian",
                 "North-West European",
                 a1),
    a2 = if_else(
      is.na(a2),
      ancestry_hierarchy$a2[match(ancp,ancestry_hierarchy$a2)],
      a2),
    a1 = if_else(
      is.na(a1),
      ancestry_hierarchy$a1[match(ancp,ancestry_hierarchy$a2)],
      a1),
    ## try matching at level 1
    a2 = if_else(
      is.na(a2),
      ancestry_hierarchy$a2[match(ancp,ancestry_hierarchy$a1)],
      a2),
    a1 = if_else(
      is.na(a1),
      ancestry_hierarchy$a1[match(ancp,ancestry_hierarchy$a1)],
      a1),
    a1 = if_else(ancp %in% c("Inadequately described",
                             "Not stated"),
                 ancp,
                 a1),
    a2 = if_else(ancp %in% c("Inadequately described",
                             "Not stated"),
                 ancp,
                 a2)
  )

sa1_ancp_level1 <- sa1_ancp_full %>%
  group_by(SA1,a1) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  group_by(SA1) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()

14 Ancestry (ANCP_collapse)

Code
ancp_collapse <- bind_rows(
  ced_read_function("ANC1P"),
  sed_read_function("ANC1P")
) %>% 
  mutate(`Oceanian, nfd` = as.double(`Oceanian, nfd`)) %>% 
  pivot_longer(cols = where(is.double),
               names_to = "ancp",
               values_to = "n",
               values_drop_na = TRUE) %>% 
  mutate(
    ANCP_collapse = case_when(
      
      grepl("^Australian",ancp) ~ "Australian",
      
      grepl("^Chinese",ancp) ~ "Chinese",
      ancp == "North-East Asian, nfd" ~ "Chinese",
      
      grepl("^English",ancp) |
        grepl("^British",ancp) | 
        ancp=="Welsh" ~ "English",
      
      ancp == "German" ~ "German",
      
      ancp == "Irish" ~ "Irish",
      
      ancp == "Italian" ~ "Italian",
      
      ancp == "Jewish" ~ "Jewish",
      
      ancp == "Greek" ~ "Greek",
      
      ancp == "Scottish" ~ "Scottish",
      
      TRUE ~ "Other"
      
    )
  ) %>% 
  group_by(state,Division,ANCP_collapse) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  group_by(state,Division) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()

14.1 SA1

Code
sa1_ancp <- sa1_read_function("ANC1P") %>% 
  rename(ancp = j) %>% 
    mutate(
    ANCP_collapse = case_when(
      
      grepl("^Australian",ancp) ~ "Australian",
      
      grepl("^Chinese",ancp) ~ "Chinese",
      ancp == "North-East Asian, nfd" ~ "Chinese",
      
      grepl("^English",ancp) |
        grepl("^British",ancp) | 
        ancp=="Welsh" ~ "English",
      
      ancp == "German" ~ "German",
      
      ancp == "Irish" ~ "Irish",
      
      ancp == "Italian" ~ "Italian",
      
      ancp == "Jewish" ~ "Jewish",
      
      ancp == "Greek" ~ "Greek",
      
      ancp == "Scottish" ~ "Scottish",
      
      TRUE ~ "Other"
      
    )
  ) %>% 
  group_by(SA1,ANCP_collapse) %>% 
  summarise(n = sum(n)) %>% 
  ungroup() %>% 
  group_by(SA1) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup() %>% 
  rename(j = ANCP_collapse)

15 Language spoken at home

Code
lanp <- bind_rows(
  ced_read_function("LANP_3"),
  sed_read_function("LANP_3")
  ) %>% 
  pivot_longer(cols = !Division & !state,
               names_to = "LANP",
               values_to = "n") %>% 
  filter(LANP != "Total")

15.1 SA1

Code
sa1_lanp <- sa1_read_function("LANP")

16 Save to file

Code
marginals <- bind_rows(
  relp %>%
    mutate(var = "RELP_collapsed") %>%
    rename(j = RELP_collapsed),
  
  sexp %>%
    mutate(var = "SEXP") %>%
    rename(j = SEXP),
  
  heap %>% mutate(var = "HEAP"),
  
  hs %>% mutate(var = "high_school"),
  
  uni %>% mutate(var = "uni") %>% 
    rename(j = uni),
  
  ingp %>% mutate(var = "INGP_binary") %>%
    rename(j = INGP_binary) %>%
    mutate(j = as.character(j)),
  
  incp %>% mutate(var = "INCP") %>%
    rename(j = INCP) %>%
    mutate(j = as.character(j)),
  
  incp_continuous_adjusted %>% mutate(var = "INCP_continuous") %>%
    rename(j = INCP_continuous) %>%
    mutate(j = as.character(j)),
  
  hind %>% mutate(var = "HIND") %>% 
    rename(j = HIND),
  
  igap %>% 
    mutate(var = "IGAP") %>% 
    rename(j = IGAP),
  
  occp %>% 
    mutate(var = "OCCP") %>% 
    rename(j = OCCP),
  
  born_country_three %>% mutate(var = "born_country_three") %>%
    rename(j = born_country_three),

  areaenum %>% mutate(var = "AREAENUM") %>%
    rename(j = description) %>%
    select(-AREAENUM),
  
  age_five %>% mutate(var = "AGE5P") %>% 
    rename(j = AGE5P),
  
  lanp %>% mutate(var = "LANP") %>% 
    rename(j = LANP),
  
  agep %>% mutate(var = "AGEP") %>% 
    rename(j = AGEP),
  
  agep_continuous %>% 
    mutate(var = "AGE_continuous") %>%
    rename(j = "AGE_continuous") %>%
    mutate(j = as.character(j)),
  
  ancp_level1 %>% 
    mutate(var = "ANCP_1") %>% 
    rename(j = a1) %>% 
    mutate(j = as.character(j)),
  
  ancp_collapse %>%
    mutate(var = "ANCP_collapse") %>%
    rename(j = ANCP_collapse) %>%
    mutate(j = as.character(j)),

) %>%
  group_by(var, Division) %>%
  mutate(p = n / sum(n)) %>%
  ungroup() %>%
  select(-n_raw)

save(marginals,
     file = here("common_data/Census/2021/CED/marginals.RData"))

16.1 SA1

Code
sa1_marginals <- bind_rows(
  
  sa1_relp %>%
    mutate(var = "RELP_collapsed"),
  
  sa1_sexp %>%
    mutate(var = "SEXP"),
  
  sa1_heap %>%
    mutate(var = "HEAP"),
  
  sa1_hscp %>%
    mutate(var = "high_school"),
  
  #ingp %>% mutate(var = "INGP_binary") %>%
  #  rename(j = INGP_binary) %>%
  #  mutate(j = as.character(j)),
  
  sa1_incp %>%
    mutate(var = "INCP") %>% 
    mutate(j = as.character(j)),
  
  sa1_incp_continuous %>%
    mutate(var = "INCP_continuous") %>%
    mutate(j = as.character(j)),
  
  sa1_hind %>% 
    mutate(var = "HIND"),
  
  ##born_country_three %>% mutate(var = "born_country_three") %>%
  ##  rename(j = born_country_three),

  ##areaenum %>% mutate(var = "AREAENUM") %>%
  ##  rename(j = description) %>%
  ##  select(-AREAENUM),
  
  sa1_age5p %>% 
    mutate(var = "AGE5P"),

  sa1_lanp %>%
    mutate(var = "LANP"),
  
  #ced_agep_continuous %>% mutate(var = "AGE_continuous") %>%
  #  rename(j = "AGE_continuous") %>% mutate(j = as.character(j)),
  
  #ced_ancp_collapse %>% mutate(var = "ANCP_collapse") %>%
  #  rename(j = ANCP_collapse) %>% mutate(j = as.character(j)),
  
  sa1_ancp_level1 %>% 
    mutate(var = "ANCP_1") %>% 
    rename(j = a1),
  
  sa1_ancp %>% 
    mutate(var = "ANCP_collapse"),

  sa1_sexp %>% 
    mutate(var = "SEXP")
) %>% 
  group_by(var,SA1) %>% 
  mutate(p = n/sum(n)) %>% 
  ungroup()

write_fst(sa1_marginals,
          path = here(
            "common_data/Census/2021/SA1/marginals.fst")
          )

16.2 Dump for Dorian

Code
load(here("common_data/Census/2021/CED/marginals.RData"))
dorian_ced_marginals <- marginals %>% 
  filter(state == "Federal") %>% 
  filter(var %in% c("AGE5P","HEAP","INCP","SEXP")) %>% 
  select(-w)

write_csv(dorian_ced_marginals,
          file = "/tmp/dorian_ced_marginals.csv")